Setup¶
Die folgenden Code-Blöcke können genutzt werden, um die benötigten Abhängigkeiten zu installieren und zu importieren.
%%capture
%pip install -r ../requirements.txt
%%capture
%load_ext pretty_jupyter
%%capture
# Laden der eingesetzten Libraries
import os
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import sweetviz as sv
from IPython.display import display
from itables import init_notebook_mode
from sklearn.linear_model import LinearRegression
init_notebook_mode(all_interactive=True)
# Funktion zur Bestimmung des Geschlechts und Berechnung des Geburtstags
def parse_details(birth_number):
birth_number_str = str(
birth_number
) # Konvertiere birth_number zu einem String, falls notwendig
year_prefix = "19"
month = int(birth_number_str[2:4])
gender = "female" if month > 12 else "male"
if gender == "female":
month -= 50
year = int(year_prefix + birth_number_str[:2])
day = int(birth_number_str[4:6])
birth_day = datetime(year, month, day)
return gender, birth_day
# Berechnung des Alters basierend auf einem Basisjahr
def calculate_age(birth_date, base_date=datetime(1999, 12, 31)):
return (
base_date.year
- birth_date.year
- ((base_date.month, base_date.day) < (birth_date.month, birth_date.day))
)
def regression_results(y_true, y_pred):
# Regression metrics
print('explained_variance: ', round(metrics.explained_variance_score(y_true, y_pred), 4))
print('mean_squared_log_error: ', round(metrics.mean_squared_log_error(y_true, y_pred), 4))
print('r2: ', round(metrics.r2_score(y_true, y_pred), 4))
print('MAE: ', round(metrics.mean_absolute_error(y_true, y_pred), 4))
print('MSE: ', round(metrics.mean_squared_error(y_true, y_pred), 4))
print('RMSE: ', round(np.sqrt(metrics.mean_squared_error(y_true, y_pred)), 4))
Aufgabenstellung¶
Inhalt der hier bearbeiteten und dokumentierten Mini-Challenge für das Modul «aml - Angewandtes Machine Learning» der FHNW ist die Entwicklung und Evaluierung von Affinitätsmodellen für personalisierte Kreditkarten-Werbekampagnen im Auftrag einer Bank. Das Ziel der Authoren ist es also, mithilfe von Kunden- und Transaktionsdaten präzise Modelle zu erstellen, die die Wahrscheinlichkeit des Kreditkartenkaufs einer bestimmten Person vorhersagen.
Laden der zur Verfügung gestellten Daten¶
Zur Verfügung gestellt wurden 8 csv-Dateien von welchen die Beschreibung der erfassten Variablen unter dem folgenden Link eingesehen werden können: PKDD'99 Discovery Challenge - Guide to the Financial Data Set. Nachfolgend werden diese csv-Dateien eingelesen.
account = pd.read_csv("./data/account.csv", sep=";", dtype={"date": "str"})
card = pd.read_csv("./data/card.csv", sep=";", dtype={"issued": "str"})
client = pd.read_csv("./data/client.csv", sep=";")
disp = pd.read_csv("./data/disp.csv", sep=";")
district = pd.read_csv("./data/district.csv", sep=";")
loan = pd.read_csv("./data/loan.csv", sep=";", dtype={"date": "str"})
order = pd.read_csv("./data/order.csv", sep=";")
trans = pd.read_csv("./data/trans.csv", sep=";", dtype={"date": "str", "bank": "str"})
account.csv¶
Der Datensatz accounts.csv beinhaltet die folgenden Informationen über die Kontos der Bank:
account_id: die Kontonummer,district_id: den Standort der entsprechenden Bankfiliale,frequency: die Frequenz der Ausstellung von Kontoauszügen (monatlich, wöchentlich, pro Transaktion) unddate: das Erstellungsdatum
account.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4500 entries, 0 to 4499 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 account_id 4500 non-null int64 1 district_id 4500 non-null int64 2 frequency 4500 non-null object 3 date 4500 non-null object dtypes: int64(2), object(2) memory usage: 140.8+ KB
card.csv¶
Der Datensatz card.csv beinhaltet die folgenden Informationen über die von der Bank herausgegebenen Kreditkarten:
card_id: die Kartennummer,disp_id: die Zuordnung zum entsprechenden Bankkonto und -inhaber (Disposition),type: die Art der Kreditkarte (junior, classic, gold) undissued: das Ausstellungsdatum
card.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 892 entries, 0 to 891 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 card_id 892 non-null int64 1 disp_id 892 non-null int64 2 type 892 non-null object 3 issued 892 non-null object dtypes: int64(2), object(2) memory usage: 28.0+ KB
client.csv¶
Der Datensatz client.csv beinhaltet die folgenden Informationen über die Kunden der Bank:
client_id: die Kundennummer,birth_number: eine Kombination aus Geburtsdatum und Geschlecht sowiedistrict_id: die Adresse
client.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5369 entries, 0 to 5368 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 client_id 5369 non-null int64 1 birth_number 5369 non-null int64 2 district_id 5369 non-null int64 dtypes: int64(3) memory usage: 126.0 KB
disp.csv¶
Der Datensatz disp.csv beinhaltet die folgenden Informationen über die Dispositionen der Bank:
disp_id: der Identifikationsschlüssel der Disposition,client_id: die Kundennummer,account_id: die Kontonummer,type: die Art der Disposition (Inhaber, Benutzer)
disp.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5369 entries, 0 to 5368 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 disp_id 5369 non-null int64 1 client_id 5369 non-null int64 2 account_id 5369 non-null int64 3 type 5369 non-null object dtypes: int64(3), object(1) memory usage: 167.9+ KB
district.csv¶
Der Datensatz district.csv beinhaltet die folgenden demografischen Informationen:
A1: die ID des Distrikts,A2: der Name des Distrikts,A3: die Region,A4: die Anzahl der Einwohner,A5: die Anzahl der Gemeinden mit < 499 Einwohner,A6: die Anzahl der Gemeinden mit 500 - 1999 Einwohner,A7: die Anzahl der Gemeinden mit 2000 - 9999 Einwohner,A8: die Anzahl der Gemeinden mit >10000 Einwohner,A9: die Anzahl Städte,A10: das Verhältnis von städtischen Einwohnern,A11: das durchschnittliche Einkommen,A12: die Arbeitslosenrate vom Jahr 95,A13: die Arbeitslosenrate vom Jahr 96,A14: die Anzahl von Unternehmer pro 1000 Einwohner,A15: die Anzahl von begangenen Verbrechen im Jahr 95,A16: die Anzahl von begangenen Verbrechen im Jahr 96,
district.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 77 entries, 0 to 76 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 A1 77 non-null int64 1 A2 77 non-null object 2 A3 77 non-null object 3 A4 77 non-null int64 4 A5 77 non-null int64 5 A6 77 non-null int64 6 A7 77 non-null int64 7 A8 77 non-null int64 8 A9 77 non-null int64 9 A10 77 non-null float64 10 A11 77 non-null int64 11 A12 77 non-null object 12 A13 77 non-null float64 13 A14 77 non-null int64 14 A15 77 non-null object 15 A16 77 non-null int64 dtypes: float64(2), int64(10), object(4) memory usage: 9.8+ KB
loan.csv¶
Der Datensatz loan.csv beinhaltet die folgenden Informationen über die vergebenen Darlehen der Bank:
loan_id: ID des Darlehens,account_id: die Kontonummer,date: das Datum, wann das Darlehen gewährt wurde,amount: der Betrag,duration: die Dauer des Darlehens,payments: die höhe der monatlichen Zahlungen undstatus: der Rückzahlungsstatus (A: ausgeglichen, B: Vertrag abgelaufen aber nicht fertig bezahlt, C: laufender Vertrag und alles in Ordnung, D: laufender Vertrag und Kunde verschuldet)
loan.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 682 entries, 0 to 681 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 loan_id 682 non-null int64 1 account_id 682 non-null int64 2 date 682 non-null object 3 amount 682 non-null int64 4 duration 682 non-null int64 5 payments 682 non-null float64 6 status 682 non-null object dtypes: float64(1), int64(4), object(2) memory usage: 37.4+ KB
order.csv¶
Der Datensatz order.csv beinhaltet die folgenden Informationen über die Daueraufträge eines Kontos:
order_id: die Nummer des Dauerauftrags,account_id: die Kontonummer von welchem der Auftrag stammt,bank_to: die empfangende Bank,account_to: das empfangende Konto,amount: der Betrag,k_symbol: die Art des Auftrags (Versicherungszahlung, Haushalt, Leasing, Darlehen)
order.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6471 entries, 0 to 6470 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 6471 non-null int64 1 account_id 6471 non-null int64 2 bank_to 6471 non-null object 3 account_to 6471 non-null int64 4 amount 6471 non-null float64 5 k_symbol 6471 non-null object dtypes: float64(1), int64(3), object(2) memory usage: 303.5+ KB
trans.csv¶
Der Datensatz trans.csv beinhaltet die folgenden Informationen über die Transaktionen eines Kontos:
trans_id: die ID der Transaktion,account_id: die Kontonummer des ausführenden Kontos,date: das Datum,type: der Typ (Einzahlung, Bezug)operation: die Art der Transaktion (Bezug Kreditkarte, Bareinzahlung, Bezug über eine andere Bank, Bezug Bar, Überweisung)amount: der Betrag der Transaktion,balance: der Kontostand nach ausführung der Transaktion,k_symbol: die Klassifikation der Transaktion (Versicherungszahlung, Kontoauszug, Zinsauszahlung, Zinszahlung bei negativem Kontostand, Haushalt, Pension, Darlehensauszahlung),bank: die empfangende Bank undaccount: das empfangende Bankkonto
trans.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1056320 entries, 0 to 1056319 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 trans_id 1056320 non-null int64 1 account_id 1056320 non-null int64 2 date 1056320 non-null object 3 type 1056320 non-null object 4 operation 873206 non-null object 5 amount 1056320 non-null float64 6 balance 1056320 non-null float64 7 k_symbol 574439 non-null object 8 bank 273508 non-null object 9 account 295389 non-null float64 dtypes: float64(3), int64(2), object(5) memory usage: 80.6+ MB
Transformationen¶
Im folgenden Abschnitt werden die geladenen Daten separat so transformiert, dass jede Zeile einer Observation und jede Spalte einer Variable im entsprechenden Datenformat entspricht, also ins Tidy-Format gebracht.
data_frames = {}
Account¶
Nachfolgend wird die date Spalte des account.csv-Datensatzes in das entsprechende Datenformat geparsed und die Werte von frequency übersetzt und als Levels einer Kategorie definiert.
# parse date
account["date"] = pd.to_datetime(account["date"], format="%y%m%d")
# translate categories
account["frequency"] = account["frequency"].replace(
{
"POPLATEK MESICNE": "monthly",
"POPLATEK TYDNE": "weekly",
"POPLATEK PO OBRATU": "transactional",
}
)
# convert column frequency to categorical
account["frequency"] = account["frequency"].astype("category")
# append account data to dataframe collection
data_frames["account.csv"] = account
# sample 5 random rows
account.sample(n=5)
| account_id | district_id | frequency | date | |
|---|---|---|---|---|
| Loading... (need help?) |
%%capture
# generate sweetviz report
svReport_account = sv.analyze(account)
svReport_account.show_html(filepath="./reports/accounts.html", open_browser=False)
TODO eda Beschreib¶
Card¶
Auch bei diesem Datensatz (card.csv) werden zunächst die Datentypen korrigiert um anschliessend die Inhalte entsprechend beschreiben zu können
# parse date
card["issued"] = pd.to_datetime(card["issued"].str[:6], format="%y%m%d")
# convert type to categorical
card["type"] = card["type"].astype("category")
# append to dataframes collection
data_frames["card.csv"] = card
%%capture
# generate sweetviz report
svReport_card = sv.analyze(card)
svReport_card.show_html(filepath="./reports/card.html", open_browser=False)
TODO eda¶
Client¶
Die Spalte birth_number des client.csv-Datensatzes codiert 3 Features der Bankkunden: Geschlecht, Geburtsdatum und damit auch das Alter. Diese Informationen werden mithilfe der zuvor definierten Funktionen parse_details() und calculate_age extrahiert.
# Geburtstag & Geschlecht aus birth_number extrahieren
client["gender"], client["birth_day"] = zip(
*client["birth_number"].apply(parse_details)
)
client["gender"] = client["gender"].astype("category")
# Alter berechnen
client["age"] = client["birth_day"].apply(calculate_age)
data_frames["client.csv"] = client
# Spalte birth_number entfernen
client = client.drop(columns=["birth_number"])
# Sample 5 random rows
client.sample(n=5)
| client_id | district_id | gender | birth_day | age | |
|---|---|---|---|---|---|
| Loading... (need help?) |
%%capture
svReport_client = sv.analyze(client)
svReport_client.show_html(filepath="./reports/client.html", open_browser=False)
TODO eda¶
Disp¶
Auch die Variablen des Datensatzes disp.csv werden in die korrekten Datentypen übertragen.
# Spalte type als Kategorie speichern
disp["type"] = disp["type"].astype("category")
data_frames["disp.csv"] = disp
# random sample
disp.sample(n=5)
| disp_id | client_id | account_id | type | |
|---|---|---|---|---|
| Loading... (need help?) |
%%capture
svReport_disp = sv.analyze(disp)
svReport_disp.show_html(filepath="./reports/disp.html", open_browser=False)
TODO eda¶
District¶
Auffällig ist, dass nebst den Spalten A2 (dem Namen) und A3 (der Region) die Spalten A12 und A15 den Datentyp object erhalten. Das ist, weil jeweils ein fehlender Wert vorhanden ist, welcher mit einem ? gekennzeichnet ist.
Zunächst benennen wir die Spaltennamen in sprechendere um.
# Spalten umbenennen
district = district.rename(
columns={
"A1": "district_id",
"A2": "district_name",
"A3": "region",
"A4": "num_of_habitat",
"A5": "num_of_small_town",
"A6": "num_of_medium_town",
"A7": "num_of_big_town",
"A8": "num_of_bigger_town",
"A9": "num_of_city",
"A10": "ratio_of_urban",
"A11": "average_salary",
"A12": "unemploy_rate95",
"A13": "unemploy_rate96",
"A14": "n_of_enterpren_per1000_inhabit",
"A15": "no_of_crimes95",
"A16": "no_of_crimes96",
}
)[
[
"district_id",
"district_name",
"region",
"num_of_habitat",
"num_of_small_town",
"num_of_medium_town",
"num_of_big_town",
"num_of_bigger_town",
"num_of_city",
"ratio_of_urban",
"average_salary",
"unemploy_rate95",
"unemploy_rate96",
"n_of_enterpren_per1000_inhabit",
"no_of_crimes95",
"no_of_crimes96",
]
]
district["region"] = district["region"].astype("category")
district["district_name"] = district["district_name"].astype("category")
district.sample(n=5)
| district_id | district_name | region | num_of_habitat | num_of_small_town | num_of_medium_town | num_of_big_town | num_of_bigger_town | num_of_city | ratio_of_urban | average_salary | unemploy_rate95 | unemploy_rate96 | n_of_enterpren_per1000_inhabit | no_of_crimes95 | no_of_crimes96 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Loading... (need help?) |
# die fehlenden Werte anzeigen
district[district.isin(["?"]).any(axis=1)]
| district_id | district_name | region | num_of_habitat | num_of_small_town | num_of_medium_town | num_of_big_town | num_of_bigger_town | num_of_city | ratio_of_urban | average_salary | unemploy_rate95 | unemploy_rate96 | n_of_enterpren_per1000_inhabit | no_of_crimes95 | no_of_crimes96 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Loading... (need help?) |
Wir gehen davon aus, dass es sich hier um effektiv fehlende Werte handelt und nicht um zensierte Daten, also Werte, für welche der exakte Wert fehlt, aber trotzdem Informationen vorhanden sind. In diesem Fall, wenn die Variable mit den fehlenden Werten eine hohe Korrelation mit anderen Prediktoren aufweist, bietet es sich an, KNN oder eine einfache lineare Regression für die Imputation anzuwenden. [1]
Die Korrelationsmatrix des SweetViz Reports zeigt, dass unemploy_rate95 stark mit unemploy_rate96 und no_of_crimes95 mit no_of_crimes96 korreliert.
# die ? ersetzen mit NaN
district = district.replace("?", np.nan)
# Datentyp korrigieren
district["no_of_crimes95"] = district["no_of_crimes95"].astype(float)
district["unemploy_rate95"] = district["unemploy_rate95"].astype(float)
# Korrelation zwischen Arbeitslosenquote 95 und 96
district[["unemploy_rate95", "unemploy_rate96"]].corr()
| unemploy_rate95 | unemploy_rate96 | |
|---|---|---|
| Loading... (need help?) |
# Korrelation zwischen Anzahl Verbrechen 95 und 96
district[["no_of_crimes95", "no_of_crimes96"]].corr()
| no_of_crimes95 | no_of_crimes96 | |
|---|---|---|
| Loading... (need help?) |
# Zeilen filtern, sodass keine fehlenden Werte vorhanden sind
district_no_na = district[district["unemploy_rate95"].notnull()]
# Lineares regressions Modell erstellen
lin_reg_unemploy = LinearRegression()
# Modell fitten
lin_reg_unemploy.fit(
district_no_na["unemploy_rate96"].values.reshape(-1, 1),
district_no_na["unemploy_rate95"].values,
)
# Modell evaluieren
regression_results(district_no_na["unemploy_rate95"],
lin_reg_unemploy.predict(district_no_na["unemploy_rate96"].values.reshape(-1, 1)))
explained_variance: 0.9634 mean_squared_log_error: 0.0051 r2: 0.9634 MAE: 0.231 MSE: 0.1002 RMSE: 0.3166
# Lineares regressions Modell erstellen
lin_reg_crime = LinearRegression()
# Modell fitten
lin_reg_crime.fit(
district_no_na["no_of_crimes96"].values.reshape(-1, 1),
district_no_na["no_of_crimes95"].values,
)
# Modell evaluieren
regression_results(district_no_na["no_of_crimes95"],
lin_reg_unemploy.predict(district_no_na["no_of_crimes96"].values.reshape(-1, 1)))
explained_variance: 0.9968 mean_squared_log_error: 0.0211 r2: 0.9948 MAE: 474.5189 MSE: 501843.8397 RMSE: 708.4094
# Vorhersage der fehlenden Werte
district.loc[district["no_of_crimes95"].isnull(), "no_of_crimes95"] = lin_reg_unemploy.predict(
district[district["no_of_crimes95"].isnull()]["no_of_crimes96"].values.reshape(-1, 1)
)
district.loc[district["unemploy_rate95"].isnull(), "unemploy_rate95"] = lin_reg_crime.predict(
district[district["unemploy_rate95"].isnull()]["unemploy_rate96"].values.reshape(-1, 1)
)
# Anzahl der Zeilen mit fehlenden Werten zählen
sum(district[district.isin([np.nan]).any(axis=1)].count())
0
%%capture
svReport_district = sv.analyze(district)
svReport_district.show_html(filepath="./reports/district.html", open_browser=False)
Loan¶
loan["date"] = pd.to_datetime(loan["date"], format="%y%m%d")
# Mutate the 'status' column based on conditions
loan["status"] = loan["status"].map(
{
"A": "contract finished",
"B": "finished contract, loan not paid",
"C": "running contract",
"D": "client in debt",
}
)
loan["status"] = loan["status"].astype("category")
# Group by 'account_id', calculate the number of loans, and sort the results
num_of_loan_df = (
loan.groupby("account_id")
.size()
.reset_index(name="num_of_loan")
.sort_values(by="num_of_loan", ascending=False)
)
# Display the resulting DataFrame
num_of_loan_df
| account_id | num_of_loan | |
|---|---|---|
| Loading... (need help?) |
# Perform an inner join between 'loan' and 'num_of_loan_df' on 'account_id'
loan = pd.merge(loan, num_of_loan_df, on="account_id", how="inner")
# Assign the resulting DataFrame to a dictionary for storage
data_frames["loan.csv"] = loan
# Sample 5 random rows from the joined DataFrame
loan.sample(n=100)
| loan_id | account_id | date | amount | duration | payments | status | num_of_loan | |
|---|---|---|---|---|---|---|---|---|
| Loading... (need help?) |
%%capture
svReport_loan = sv.analyze(loan)
svReport_loan.show_html(filepath="./reports/loan.html", open_browser=False)
Order¶
order
| order_id | account_id | bank_to | account_to | amount | k_symbol |
|---|---|---|---|---|---|
| Loading... (need help?) |
# Assuming 'order' and 'account' DataFrames are already loaded
# Correctly map and fill missing values in 'k_symbol' column
order["k_symbol"] = (
order["k_symbol"]
.map(
{
"POJISTNE": "insurance_payment",
"SIPO": "household",
"UVER": "loan_payment",
"LEASING": "leasing",
}
)
.fillna("unknown")
)
# Merge with 'account_id_df' to ensure all accounts are represented
order = pd.merge(account[["account_id"]], order, on="account_id", how="left")
# After merging, fill missing values that may have been introduced
order["k_symbol"] = order["k_symbol"].fillna("unknown")
order["amount"] = order["amount"].fillna(0)
order["has_order"] = ~order.isna().any(axis=1)
orders_pivot = order.pivot_table(
index="account_id", columns="k_symbol", values="amount", aggfunc="sum"
)
# Add prefix to column names
orders_pivot.columns = orders_pivot.columns
orders_pivot = orders_pivot.reset_index()
# Assuming data_frames is a dictionary for storing DataFrames
data_frames["order.csv"] = orders_pivot
# NaN to 0
data_frames["order.csv"] = data_frames["order.csv"].fillna(0)
# Sample 5 random rows from the merged DataFrame
data_frames["order.csv"].sample(n=10)
| k_symbol | account_id | household | insurance_payment | leasing | loan_payment | unknown |
|---|---|---|---|---|---|---|
| Loading... (need help?) |
data_frames["order.csv"].columns
Index(['account_id', 'household', 'insurance_payment', 'leasing',
'loan_payment', 'unknown'],
dtype='object', name='k_symbol')
%%capture
svReport_order = sv.analyze(order)
svReport_order.show_html(filepath="./reports/order.html", open_browser=False)
Trans¶
trans["date"] = pd.to_datetime(trans["date"], format="%y%m%d")
# Convert 'date' from string to datetime
trans["date"] = pd.to_datetime(trans["date"])
# Update 'type' column
trans["type"] = trans["type"].replace({"PRIJEM": "credit", "VYDAJ": "withdrawal"})
# Update 'operation' column
trans["operation"] = trans["operation"].replace(
{
"VYBER KARTOU": "credit card withdrawal",
"VKLAD": "credit in cash",
"PREVOD Z UCTU": "collection from another bank",
"VYBER": "cash withdrawal",
"PREVOD NA UCET": "remittance to another bank",
}
)
# Update 'k_symbol' column
trans["k_symbol"] = trans["k_symbol"].replace(
{
"POJISTNE": "insurance payment",
"SLUZBY": "statement payment",
"UROK": "interest credited",
"SANKC. UROK": "sanction interest if negative balance",
"SIPO": "household payment",
"DUCHOD": "pension credited",
"UVER": "loan payment",
}
)
# negate the amount if type is credit
trans.loc[trans['type'] == 'credit', 'amount'] = trans.loc[trans['type'] == 'credit', 'amount'] * (-1)
# Assign to a dictionary if needed (similar to list assignment in R)
data_frames["trans.csv"] = trans
# Sample 5 random rows from the DataFrame
trans.sample(n=1000)
trans
| trans_id | account_id | date | type | operation | amount | balance | k_symbol | bank | account |
|---|---|---|---|---|---|---|---|---|---|
| Loading... (need help?) |
# Plot Zeitliche Entwicklung des Konto-Saldos für die Konto nummer 19
account_19 = trans[trans["account_id"] == 19].copy() # Create a copy of the DataFrame
# Ensure the date column is in datetime format
account_19["date"] = pd.to_datetime(account_19["date"])
# Sort the values by date
account_19 = account_19.sort_values("date")
plt.figure(figsize=(10, 6))
plt.plot(account_19["date"], account_19["balance"])
plt.title("Time evolution of balance for account number 19")
plt.xlabel("Date")
plt.ylabel("Balance")
plt.show()
# zoom the year 1995 of the plot
account_19_1995 = account_19[account_19["date"].dt.year == 1995]
# plot it
plt.figure(figsize=(10, 6))
plt.plot(account_19_1995["date"], account_19_1995["balance"])
plt.title("Time evolution of balance for account number 19 in 1995")
plt.xlabel("Date")
plt.ylabel("Balance")
plt.show()
# Wee see that there is a steep line in 1995-10 so there are two transactions, this we have to clean.
%%capture
svReport_trans = sv.analyze(trans)
svReport_trans.show_html(filepath="./reports/trans.html", open_browser=False)
Explorative Datenanalyse¶
In diesem Abschnitt wird mittels EDA ein Überblick über die eingelesenen Daten gewonnen.
D&Q¶
# Check for missing values in each DataFrame
for df_name, df in data_frames.items():
print(f"Missing values in {df_name}:")
print(df.isna().sum().sum()) # Sum of all missing values in the DataFrame
Missing values in account.csv: 0 Missing values in card.csv: 0 Missing values in client.csv: 0 Missing values in disp.csv: 0 Missing values in loan.csv: 0 Missing values in order.csv: 0 Missing values in trans.csv: 2208738
Merge the Dataframe['XXX'] for non transaction Data¶
# merge dataframes
non_transactional_data = (
data_frames["disp.csv"]
.add_suffix("_disp")
.merge(
data_frames["account.csv"].add_suffix("_account"),
left_on="account_id_disp",
right_on="account_id_account",
how="left",
)
.merge(
data_frames["card.csv"].add_suffix("_card"),
left_on="disp_id_disp",
right_on="disp_id_card",
how="left",
)
.merge(
data_frames["loan.csv"].add_suffix("_loan"),
left_on="account_id_disp",
right_on="account_id_loan",
how="left",
)
.merge(
data_frames["order.csv"].add_suffix("_order"),
left_on="account_id_disp",
right_on="account_id_order",
how="left",
)
)
non_transactional_data.columns
Index(['disp_id_disp', 'client_id_disp', 'account_id_disp', 'type_disp',
'account_id_account', 'district_id_account', 'frequency_account',
'date_account', 'card_id_card', 'disp_id_card', 'type_card',
'issued_card', 'loan_id_loan', 'account_id_loan', 'date_loan',
'amount_loan', 'duration_loan', 'payments_loan', 'status_loan',
'num_of_loan_loan', 'account_id_order', 'household_order',
'insurance_payment_order', 'leasing_order', 'loan_payment_order',
'unknown_order'],
dtype='object')
cols_to_replace_na = [
"household_order",
"insurance_payment_order",
"loan_payment_order",
"leasing_order",
"unknown_order",
]
non_transactional_data[cols_to_replace_na] = non_transactional_data[
cols_to_replace_na
].fillna(0)
Dropping of Junior Cards that are not on the edge to a normal card Analyse¶
# join district and client left join on district_id
non_transactional_data = non_transactional_data.merge(
data_frames["district.csv"],
left_on="district_id_account",
right_on="district_id",
how="left",
)
non_transactional_data
# merge client with suffix
non_transactional_data = non_transactional_data.merge(
data_frames["client.csv"].add_suffix("_client"),
left_on="client_id_disp",
right_on="client_id_client",
how="left",
)
non_transactional_data["has_card"] = ~non_transactional_data["card_id_card"].isna()
# Filter rows where 'has_card' is True
filtered_data = non_transactional_data[non_transactional_data["has_card"]]
# Check if there are duplicated 'account_id' in the filtered data
duplicated_account_id = filtered_data["account_id_account"].duplicated().sum()
print(duplicated_account_id)
Junior Cards removal¶
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
display(non_transactional_data)
# Filter rows where 'card_type' contains 'junior' (case insensitive)
junior_cards = non_transactional_data[
non_transactional_data["type_card"].str.contains("junior", case=False, na=False)
]
display(junior_cards)
# Calculate age at issue
junior_cards["age_at_issue"] = (
junior_cards["issued_card"] - junior_cards["birth_day_client"]
).dt.days // 365
# Plot histogram
plt.figure(figsize=(10, 6))
sns.histplot(data=junior_cards, x="age_at_issue", bins=20)
plt.title("Age distribution at issue date of junior cards")
plt.xlabel("Age at issue date")
plt.ylabel("Number of cards")
plt.show()
In the advertising campaign, we do not want to promote children's/junior cards (for whatever reasons). First, I looked at the distribution of age at issuance. Here I see that there are not many junior cards, nor are the cards issued at a late age.
num_accounts_before = len(non_transactional_data)
# Filter rows where 'card_type' does not contain 'junior' (case insensitive)
non_transactional_data = non_transactional_data[
~non_transactional_data["type_card"].str.contains("junior", case=False, na=False)
]
num_accounts_after = len(non_transactional_data)
num_junior_cards = num_accounts_before - num_accounts_after
print(f"Number of junior cards removed: {num_junior_cards}")
%%capture
import subprocess
import pathlib
try:
file_path = pathlib.Path(os.path.basename(__file__))
except:
file_path = pathlib.Path("AML_MC.ipynb")
# Check the file extension
if file_path.suffix == ".py":
# If it's a Python script, convert it to a notebook
try:
subprocess.check_output(["jupytext", "--to", "notebook", str(file_path)])
print("Converted to notebook.")
except subprocess.CalledProcessError as e:
print("Conversion failed. Error message:", e.output)
elif file_path.suffix == ".ipynb":
# If it's a notebook, convert it to a Python script with cell markers
try:
subprocess.check_output(["jupytext", "--to", "py:percent", str(file_path)])
print("Converted to Python script.")
except subprocess.CalledProcessError as e:
print("Conversion failed. Error message:", e.output)
else:
print("Unsupported file type.")
# Update html output
# jupyter nbconvert --to html --template pj AML_MC.ipynb